
**********
* Readme *
**********

* This script
* [1] creates ids for households, families and individuals
* [2] and merges the files, keeping the relevant variables.

* Note: Original POF sample size: 
*  57 920 household units
*  58 039 families
* 178 431 individuals

* Note: Constant prices and population totals as of 15 jan 2018.


* Project root (THE USER MUST EDIT THIS PATH BEFORE RUNNING)
***********************************************************
clear all
global analysis "C:/***/replication_package"


* Open a log whose file name carries today's date (YYMMDD)
**********************************************************
local run_date = date(c(current_date), "DMY")
global today = strofreal(`run_date', "%tdYYNNDD")
log using "${analysis}/code/logs/2_1_build_pof_ind_attributes_${today}.smcl", replace


*********************************************
* MORADOR [master: hu_id fam_id and ind_id] *
*********************************************
use "${analysis}/data/source_files/pof/morador.dta", clear

* The key components must be strings before they can be concatenated
tostring COD_UPA NUM_DOM NUM_UC COD_INFORMANTE, replace

* House unit identifier: COD_UPA-NUM_DOM
gen hu_id = COD_UPA + "-" + NUM_DOM
label variable hu_id "House unit identifier"

* Family unit identifier: house unit id extended with NUM_UC
gen fam_id = hu_id + "-" + NUM_UC
label variable fam_id "Family unit identifier"

* Individual identifier: family unit id extended with COD_INFORMANTE
gen ind_id = fam_id + "-" + COD_INFORMANTE
label variable ind_id "Individual identifier"


* Set survey attributes
***********************
rename COD_UPA psu_id
label var psu_id "Survey primary sampling unit id"

* Import strata info
* The merge key is the string version of ESTRATO_POF.
* keep(master match) retains every person record but prevents strata rows that
* exist only in the auxiliary file from being appended as person records with
* empty identifiers.
tostring ESTRATO_POF, generate(pof_strata)
sort pof_strata
merge m:1 pof_strata using "${analysis}/data/1_3_aux_strata_info.dta", keepusing(strata_id urban state region pop_region) keep(master match) nogenerate
drop ESTRATO_POF pof_strata

* Poststratification correction -- identical to PESO_FINAL
* (survwgt is a user-written command and must be installed)
rename PESO pre_weight
survwgt poststratify pre_weight, by(region) totvar(pop_region) generate(pweight)
label var pweight "Survey sampling weight (w. poststratification)"

* Declare the survey design used by all svy: commands below
svyset psu_id [pweight = pweight], strata(strata_id) singleunit(centered)


* Set place attributes
**********************

* Regions (indicators)
* Each triple below is: state abbreviation, two-digit state prefix, areas present.
* The region code is the prefix followed by 1 (capital), 2 (metropolitan area)
* or 3 (remaining areas); the variable is named region_<state>_<c|m|o>.
local state_spec ///
  RO 11 co   AC 12 co   AM 13 cmo  RR 14 co   PA 15 cmo  AP 16 cmo  TO 17 co  ///
  MA 21 cmo  PI 22 co   CE 23 cmo  RN 24 cmo  PB 25 cmo  PE 26 cmo  AL 27 cmo ///
  SE 28 cmo  BA 29 cmo  MG 31 cmo  ES 32 cmo  RJ 33 cmo  SP 35 cmo            ///
  PR 41 cmo  SC 42 cmo  RS 43 cmo                                             ///
  MS 50 co   MT 51 cmo  GO 52 cmo  DF 53 c

local digit_c 1
local digit_m 2
local digit_o 3

local text_c "capital"
local text_m "metropolitan area"
local text_o "remaining areas"

local n_tokens : word count `state_spec'
forvalues t = 1(3)`n_tokens' {
    local uf    : word `t' of `state_spec'
    local base  : word `=`t' + 1' of `state_spec'
    local areas : word `=`t' + 2' of `state_spec'

    * One indicator per area listed for this state, in c-m-o order
    local n_areas = strlen("`areas'")
    forvalues k = 1/`n_areas' {
        local a = substr("`areas'", `k', 1)
        gen byte region_`uf'_`a' = (region == `base'`digit_`a'')
        label var region_`uf'_`a' "`uf', `text_`a''"
    }
}


* Individual attributes
***********************

* Gender (indicator): 1 when V0404 equals 1, 0 otherwise
gen male = (V0404 == 1)
label define male 0 "Female" 1 "Male"
label values male male
label var male "Gender male"
svy: proportion male

* Race (indicator): V0405 codes 1 and 3 are grouped as "White"
svy: proportion V0405
gen white = (V0405 == 1 | V0405 == 3)
label define white 0 "Non white" 1 "White"
label values white white
label var white "Race white"
svy: proportion white

* Race gender interaction (categorical)
* Records matching none of the four combinations stay missing.
local is_white    inlist(V0405, 1, 3)
local is_nonwhite inlist(V0405, 2, 4, 5, 9)
local is_female   (V0404 == 2)
local is_male     (V0404 == 1)

gen race_gender = .
replace race_gender = 1 if `is_nonwhite' & `is_female'
replace race_gender = 2 if `is_white'    & `is_female'
replace race_gender = 3 if `is_nonwhite' & `is_male'
replace race_gender = 4 if `is_white'    & `is_male'

label define rg 1 "Nonwhite female" 2 "White female" 3 "Nonwhite male"  4 "White male"
label values race_gender rg
label var race_gender "Race and gender"

* Race gender interaction (indicators), one per category of race_gender
local rg_names  nonwhite_female white_female nonwhite_male white_male
local rg_labels `" "Nonwhite female" "White female" "Nonwhite male" "White male" "'
forvalues k = 1/4 {
    local rg_name  : word `k' of `rg_names'
    local rg_label : word `k' of `rg_labels'
    gen rg_`k'_`rg_name' = (race_gender == `k')
    label var rg_`k'_`rg_name' "`rg_label'"
}

* Age (numeric)
rename V0403 years_age
label variable years_age "Age, in years"

* Age (categorical): six bands, stored in the new variable age
recode years_age (0/24  = 1 "Under 25") (25/34 = 2 "25-34") (35/44  = 3 "35-44")    ///
                 (45/54 = 4 "45-54")    (55/64 = 5 "55-64") (65/150 = 6 "Above 64"), ///
                 generate(age)

label variable age "Age groups"

* Years of education (numeric)
rename ANOS_ESTUDO years_educ
label variable years_educ "Schooling, in years"

* Highest educational level (categorical)
* Builds a 7-level working variable (educ_default) from V0415 and V0419-V0430,
* then collapses it into the 4-level variable educ.
* The levels are assigned by successive replace statements, so when a record
* satisfies the conditions of more than one level the LAST matching level wins;
* statement order is therefore part of the logic.
* Within each condition, & is evaluated before |, so every line joined by |
* is an independent alternative.
* Records matching no condition keep a missing educ_default (and a missing educ).

* No school
gen educ_default = 1 if ///
  inlist(V0415, 2) & inlist(V0424, 2) | ///
  inlist(V0419, 1, 2, 3) | ///
  inlist(V0419, 4) & inlist(V0422, 1, 13) | ///
  inlist(V0425, 1, 2) | ///
  inlist(V0425, 3, 4) & inlist(V0430, 2) | ///
  inlist(V0425, 7) & inlist(V0426, 2) & inlist(V0428, 2, 3)

* Primary school incomplete
* NOTE(review): the clause "V0425 == 8 & V0428 == 3 & V0430 == 2" appears twice
* below (once through inlist(V0425, 7, 8)); the repetition does not change the result.
replace educ_default = 2 if ///
  inlist(V0419, 4) & inlist(V0422, 2, 3, 4, 5, 6, 7, 8, 9) | ///
  inlist(V0419, 5) & (inlist(V0422, 1, 2, 3, 4, 5, 6, 7, 8) | inlist(V0422, 13)) | ///
  inlist(V0425, 3, 4) & inlist(V0430, 1) | ///
  inlist(V0425, 5) | ///
  inlist(V0425, 6, 8) & inlist(V0428, 2) | ///
  inlist(V0425, 6) & inlist(V0428, 3) & inlist(V0430, 2)| ///
  inlist(V0425, 6) & inlist(V0429, 1, 2, 3)| ///
  inlist(V0425, 6) & inlist(V0429, 4) & inlist(V0430, 2) | ///
  inlist(V0425, 7) & inlist(V0426, 1) & inlist(V0428, 2, 3) | ///
  inlist(V0425, 7) & inlist(V0426, 1) & inlist(V0429, 1, 2, 3, 4, 5, 6, 7) | ///
  inlist(V0425, 7) & inlist(V0426, 2) & inlist(V0429, 1, 2, 3, 4, 5, 6, 7, 8) | ///
  inlist(V0425, 7, 8) & inlist(V0428, 3) & inlist(V0430, 2) | ///
  inlist(V0425, 8) & inlist(V0428, 3) & inlist(V0430, 2) | ///
  inlist(V0425, 8) & inlist(V0429, 1, 2, 3, 4, 5, 6, 7)

* Primary school
* NOTE(review): the clause "V0425 == 8 & V0428 == 3 & V0430 == 1" is likewise
* listed twice here; harmless.
replace educ_default = 3 if ///
  inlist(V0419, 6, 7) & inlist(V0422, 1, 13) | ///
  inlist(V0425, 6) & inlist(V0428, 3) & inlist(V0430, 1) | ///
  inlist(V0425, 6) & inlist(V0429, 4) & inlist(V0430, 1) | ///
  inlist(V0425, 6) & inlist(V0429, 5) | ///
  inlist(V0425, 7) & inlist(V0426, 1) & inlist(V0429, 8) | ///
  inlist(V0425, 7) & inlist(V0426, 2) & inlist(V0429, 9) | ///
  inlist(V0425, 7, 8) & inlist(V0428, 3) & inlist(V0430, 1) | ///
  inlist(V0425, 8) & inlist(V0428, 3) & inlist(V0430, 1) | ///
  inlist(V0425, 8) & inlist(V0429, 8) | ///
  inlist(V0425, 9, 10, 11) & inlist(V0428, 2) | ///
  inlist(V0425, 9, 10, 11) & inlist(V0428, 3) & inlist(V0430, 2)

* High school incomplete
replace educ_default = 4 if ///
  inlist(V0419, 6) & inlist(V0422, 2, 3, 4) | ///
  inlist(V0419, 7) & inlist(V0422, 2, 3) | ///
  inlist(V0425, 9, 10, 11) & inlist(V0429, 1, 2) | ///
  inlist(V0425, 9, 10) & inlist(V0429, 3) & inlist(V0430, 2)

* High school
replace educ_default = 5 if ///
  inlist(V0419, 8) & inlist(V0421, 1) & inlist(V0422, 1, 2) & inlist(V0423, 2) | ///
  inlist(V0419, 8) & inlist(V0421, 2, 3) & inlist(V0422, 1) & inlist(V0423, 2) | ///
  inlist(V0425, 9, 10, 11) & inlist(V0428, 3) & inlist(V0430, 1) | ///
  inlist(V0425, 9, 10, 11) & inlist(V0429, 3) & inlist(V0430, 1) | ///
  inlist(V0425, 9, 10, 11) & inlist(V0429, 4) | ///
  inlist(V0425, 11) & inlist(V0429, 3) | ///
  inlist(V0425, 12) & inlist(V0427, 1) & inlist(V0429, 1) | ///
  inlist(V0425, 12) & inlist(V0428, 2)

* College incomplete
replace educ_default = 6 if ///
  inlist(V0419, 8) & inlist(V0421, 1) & inlist(V0422, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) & inlist(V0423, 2) | ///
  inlist(V0419, 8) & inlist(V0421, 2, 3) & inlist(V0422, 2, 3, 4, 5, 6) & inlist(V0423, 2) | ///
  inlist(V0425, 12) & inlist(V0427, 1) & inlist(V0429, 2, 3) | ///
  inlist(V0425, 12) & inlist(V0427, 1) & inlist(V0429, 4, 5, 6, 7, 8, 9, 10, 11) & inlist(V0430, 2) | ///
  inlist(V0425, 12) & inlist(V0427, 2, 3) & inlist(V0429, 1) | ///
  inlist(V0425, 12) & inlist(V0427, 2, 3) & inlist(V0429, 2, 3, 4, 5) & inlist(V0430, 2)

* College complete or above
replace educ_default = 7 if ///
  inlist(V0419, 8) & inlist(V0423, 1) | ///
  inlist(V0419, 9, 10, 11) | ///
  inlist(V0425, 12) & inlist(V0427, 1) & inlist(V0429, 4, 5, 6, 7, 8, 9, 10, 11) & inlist(V0430, 1) | ///
  inlist(V0425, 12) & inlist(V0427, 1) & inlist(V0429, 12) | ///
  inlist(V0425, 12) & inlist(V0427, 2, 3) & inlist(V0429, 2, 3, 4, 5) & inlist(V0430, 1) | ///
  inlist(V0425, 12) & inlist(V0427, 2, 3) & inlist(V0429, 6) | ///
  inlist(V0425, 13, 14, 15)

* Group into main categories
* Each "incomplete" level is folded into the completed level below it.
gen     educ = 1 if inlist(educ_default, 1, 2) // No school (or primary school incomplete)
replace educ = 2 if inlist(educ_default, 3, 4) // Primary school (or high school incomplete)
replace educ = 3 if inlist(educ_default, 5, 6) // High school (or college incomplete)
replace educ = 4 if inlist(educ_default, 7)    // College (or above)
drop educ_default

label define educ ///
  1 "Less than primary school" ///
  2 "Primary school" ///
  3 "High school" ///
  4 "College or above"

label values educ educ
label var educ "Highest educational level concluded"

* Visual cross-check of the derived levels against reported years of schooling
tab years_educ educ

* Age education interaction (indicators)
tab age educ, missing

* One dummy per age group (age_1, age_2, ...)
tab age, gen(age_)
local age_text `" "Under 25" "25-34" "35-44" "45-54" "55-64" "Over 64" "'
forvalues a = 1/6 {
    local a_lab : word `a' of `age_text'
    label var age_`a' "`a_lab'"
}
describe age_*

* One dummy per education level (educ_1, educ_2, ...)
tab educ, gen(educ_)
local educ_text `" "Less than primary school" "Primary school" "High school" "College or above" "'
forvalues e = 1/4 {
    local e_lab : word `e' of `educ_text'
    label var educ_`e' "`e_lab'"
}
describe educ_*

* Product of every age dummy with every education dummy: ae_<age index>_<educ index>
unab age_dummies  : age_*
unab educ_dummies : educ_*
local n_age  : word count `age_dummies'
local n_educ : word count `educ_dummies'
forvalues a = 1/`n_age' {
    local age_var : word `a' of `age_dummies'
    forvalues e = 1/`n_educ' {
        local educ_var : word `e' of `educ_dummies'
        gen byte ae_`a'_`e' = `age_var' * `educ_var'
    }
}

* Labels: "<age band>, <education level in lower case>"
forvalues a = 1/6 {
    local a_lab : word `a' of `age_text'
    forvalues e = 1/4 {
        local e_lab : word `e' of `educ_text'
        local e_lab = lower("`e_lab'")
        label var ae_`a'_`e' "`a_lab', `e_lab'"
    }
}

* Currently attending school (categorical)
* V0419 codes 1-7 map to 1, codes 8-11 map to 2, anything else (including missing) to 0.
gen attending_school = cond(inlist(V0419, 1, 2, 3, 4, 5, 6, 7), 1, ///
                       cond(inlist(V0419, 8, 9, 10, 11), 2, 0))

label define at_school ///
  0 "Not attending school" ///
  1 "Attending school" ///
  2 "Attending college or above"

label values attending_school at_school
label variable attending_school "Currently attending school"

tab attending_school, missing

* Currently attending school (indicators)
gen at_school  = (attending_school == 1)
label variable at_school  "Attending school"

gen at_college = (attending_school == 2)
label variable at_college "Attending college or above"


* Health plan
*************
gen health_plan = (V0406 == 1)
label variable health_plan "Has health plan"


* Credit attributes
*******************
* Each indicator equals 1 when the source variable is 0 or missing.
local credit_src   V0409 V0410 V0411 V0413
local credit_names credit_card current_account overdraft_facility savings_account

forvalues k = 1/4 {
    local src  : word `k' of `credit_src'
    local name : word `k' of `credit_names'
    * Label text is the variable stem with underscores turned into spaces
    local text = subinstr("`name'", "_", " ", .)

    gen no_`name' = (`src' == 0 | missing(`src'))
    label variable no_`name' "No `text'"
}


* Family unit income
********************
* Give the two income variables analysis-friendly names
rename (RENDA_TOTAL RENDA_DISP_PC) (gross_inc_total net_inc_pc)

label variable gross_inc_total "Total gross household income"
label variable net_inc_pc      "Available household income per capita"


* Family-related attributes
***************************
sort fam_id ind_id

* Note: Following POF definitions, we use "family" to denote a collective of relatives who live together and share a common budget.

* Family position (categorical)
* Derived from V0306; the "other member" group (V0306 7-17) is split by age.
* Stata treats a missing value as larger than any number, so "years_age > 64"
* alone would classify a member with unknown age as senior. The senior rule
* therefore excludes missing ages; such records keep a missing family_position
* and are visible in the tabulation below.
gen     family_position = 1   if inrange(V0306, 1, 1) // head
replace family_position = 2   if inrange(V0306, 2, 3) // partner
replace family_position = 300 if inrange(V0306, 4, 6) // son or daughter
replace family_position = 401 if inrange(V0306, 7, 17) & years_age < 22 // other young hh members
replace family_position = 402 if inrange(V0306, 7, 17) & years_age > 21 & years_age < 65 // other adult hh members
replace family_position = 403 if inrange(V0306, 7, 17) & years_age > 64 & !missing(years_age) // other senior hh members
replace family_position = 900 if inrange(V0306, 18, 19) // domestic workers and their relatives

label define family_position 1 "Head" 2 "Partner" 300 "Son or daughter" 401 "Other young member" 402 "Other adult member" 403 "Other senior member" 900 "Domestic worker"
label values family_position family_position
label var family_position "Family position"

tab family_position, missing


* Differentiate between "with/no partner" and "with kids/no kids"
*****************************************************************

* Is there a partner in the family?
* max() of a 0/1 expression gives a true 0/1 flag. A per-family count would
* exceed 1 when more than one partner record exists, and the head would then
* match neither "has_partner == 0" nor "has_partner == 1" below.
bysort fam_id: egen has_partner = max(family_position == 2)
label define has_partner 1 "with partner" 0 "no partner"
label values has_partner has_partner
label var has_partner "Family with a partner"
tabulate has_partner, missing

* Is there a child in the family? (same 0/1 construction)
bysort fam_id: egen has_kids = max(family_position == 300)
label define has_kids 1 "with child" 0 "no child"
label values has_kids has_kids
label var has_kids "Family with children"
tabulate has_kids, missing

codebook family_position

* Refine "Head" (1) by partner/kids presence and "Partner" (2) by kids presence
replace family_position = 100 if family_position == 1 & has_partner == 0 & has_kids == 0
replace family_position = 101 if family_position == 1 & has_partner == 0 & has_kids == 1

replace family_position = 110 if family_position == 1 & has_partner == 1 & has_kids == 0
replace family_position = 111 if family_position == 1 & has_partner == 1 & has_kids == 1

replace family_position = 210 if family_position == 2 & has_kids == 0
replace family_position = 211 if family_position == 2 & has_kids == 1

label define family_position 100 "Head, no partner, no kids", add
label define family_position 101 "Head, no partner, with kids", add

label define family_position 110 "Head, with partner, no kids", add
label define family_position 111 "Head, with partner, with kids", add

label define family_position 210 "Partner, no kids", add
label define family_position 211 "Partner, with kids", add

codebook family_position

* Household positions as indicators
* Parallel lists: variable suffix, family_position code, variable label.
local fp_names  h_wp_nk h_wp_wk h_np_nk h_np_wk p_nk p_wk child oy oa os
local fp_codes  110     111     100     101     210  211  300   401 402 403
local fp_labels `" "Head, with partner, no kids" "Head, with partner, with kids" "Head, no partner, no kids" "Head, no partner, with kids" "Partner, no kids" "Partner, with kids" "Child" "Other young hh member" "Other adult hh member" "Other senior hh member" "'

local n_fp : word count `fp_names'
forvalues k = 1/`n_fp' {
    local fp_name  : word `k' of `fp_names'
    local fp_code  : word `k' of `fp_codes'
    local fp_label : word `k' of `fp_labels'

    gen byte fp_`fp_name' = (family_position == `fp_code')
    label var fp_`fp_name' "`fp_label'"
}

describe fp_*


* Other family composition variables
************************************
* All counts are per fam_id and exclude domestic workers and their relatives
* (V0306 codes 18 and 19). egen total() is the documented name of the
* function formerly called sum().
local is_member !inlist(V0306, 18, 19)

* How many family members?
bysort fam_id: egen family_size = total(`is_member')
label var family_size "Family size (excl. domestic workers)"

* How many kids in the family? (age criteria)
bysort fam_id: egen n_kids = total(years_age < 15 & `is_member')
label var n_kids "N. kids (less than 15 years old)"

* How many young family members? (age criteria)
bysort fam_id: egen n_youngs = total(years_age > 14 & years_age < 22 & `is_member')
label var n_youngs "N. young members (15-21)"

* How many adult family members? (age criteria)
bysort fam_id: egen n_adults = total(years_age > 21 & years_age < 65 & `is_member')
label var n_adults "N. adult members (22-64)"

* How many senior family members? (age criteria)
* A missing age compares as larger than any number in Stata, so it is excluded
* explicitly; otherwise members with unknown age would be counted as seniors.
bysort fam_id: egen n_seniors = total(years_age > 64 & !missing(years_age) & `is_member')
label var n_seniors "N. elderly members (65+)"


* Wrap up and export
********************
* Variables written to the individual-level summary file, in output order
global export_set ///
  state region strata_id urban psu_id pweight ///
  hu_id fam_id ind_id ///
  family_position male white race_gender ///
  years_age age years_educ educ attending_school ///
  family_size n_kids n_youngs n_adults n_seniors ///
  health_plan no_credit_card no_current_account no_savings_account no_overdraft_facility ///
  gross_inc_total net_inc_pc ///
  rg_* fp_* age_* educ_* ae_* at_* region_*

keep  ${export_set}
order ${export_set}
sort  ind_id

* Remove fully identical rows, shrink storage types, document and save
duplicates drop
compress
describe
save "${analysis}/data/2_1_pof_morador_summary.dta", replace


*********************
* DOMICILIO [hu_id] *
*********************
use "${analysis}/data/source_files/pof/domicilio.dta", clear

* The key components must be strings before they can be concatenated
tostring COD_UPA NUM_DOM, replace

* House unit identifier: COD_UPA-NUM_DOM
gen hu_id = COD_UPA + "-" + NUM_DOM
label variable hu_id "House unit identifier"

* NUM_DOM is no longer needed once the identifier exists
drop NUM_DOM


* House characteristics
***********************
gen n_sleep_rooms = V0206
label variable n_sleep_rooms "Number of sleeping rooms"

* "Lack" indicators are 1 whenever the source code is not the favourable one,
* which includes missing source values.
gen piped_water_lack = !(V0208 == 1)
label variable piped_water_lack "No steady piped water"

gen waste_lack = (V0213 != 1 & V0213 != 2)
label variable waste_lack "No public waste collection"

gen energy_lack = !(V0215 == 1)
label variable energy_lack "No steady electricity"

* Tenure, from V0217: 1-2 own, 3 rented, 4-7 other
gen house_own = (V0217 == 1 | V0217 == 2)
label variable house_own "Own house"

gen house_rented = (V0217 == 3)
label variable house_rented "Rented house"

gen house_other = inlist(V0217, 4, 5, 6, 7)
label variable house_other "Rent-free accomodation"

gen unpaved_street = inlist(V0220, 2)
label variable unpaved_street "Unpaved street"

* Food insecurity: copy of V6199 with value labels attached
gen food_insec = V6199

label define food_insec ///
  1 "No food insecurity" ///
  2 "Some food insecurity" ///
  3 "Moderate food insecurity" ///
  4 "Severe food insecurity"

label values food_insec food_insec
label variable food_insec "Family food insecurity status"


* Wrap up and export
********************
* Variables written to the house-unit summary file, in output order
local dom_vars hu_id house_own house_rented house_other n_sleep_rooms piped_water_lack waste_lack energy_lack unpaved_street food_insec

keep  `dom_vars'
order `dom_vars'
sort  hu_id

duplicates drop
compress
describe
save "${analysis}/data/2_1_pof_domicilio_summary.dta", replace


*********************************************
* INVENTARIO DA UNIDADE DE CONSUMO [fam_id] *
*********************************************
use "${analysis}/data/source_files/pof/inventario.dta", clear

* The key components must be strings before they can be concatenated
tostring COD_UPA NUM_DOM NUM_UC, replace

* Family unit identifier: COD_UPA-NUM_DOM-NUM_UC
gen fam_id = COD_UPA + "-" + NUM_DOM + "-" + NUM_UC
label variable fam_id "Family unit identifier"

* The components other than COD_UPA are dropped once the identifier exists
drop NUM_DOM NUM_UC


* Selected items
****************
* A family "has" an item when at least one of its inventory rows carries one
* of the item's V9001 codes. Parallel lists: suffix, V9001 codes, label text.
local item_names  fridge tv pc car
local item_codes  `" "1400301, 1400401" "1401301, 1401401" "1401901" "1403001" "'
local item_labels `" "fridge" "TV" "computer" "car" "'

forvalues k = 1/4 {
    local item  : word `k' of `item_names'
    local codes : word `k' of `item_codes'
    local text  : word `k' of `item_labels'

    * Family-level maximum of the row-level 0/1 match
    bysort fam_id: egen has_`item' = max(inlist(V9001, `codes'))
    label variable has_`item' "Has `text'"
}


* Selected items (new condition)
********************************
* Same items as above, restricted to rows with V9012 == 1 and V1404 after 2014.
* Stata treats a missing value as larger than any number, so "V1404 > 2014"
* alone is TRUE when V1404 is missing; rows with an unknown V1404 are
* excluded explicitly so they cannot flag an item as new.
local is_recent (V9012 == 1) & (V1404 > 2014) & !missing(V1404)

gen temp_fridge = (V9001 == 1400301 | V9001 == 1400401) & `is_recent'
bysort fam_id: egen has_new_fridge = max(temp_fridge)
label var has_new_fridge "Has new fridge (less than 4 yrs old)"
drop temp_fridge

gen temp_tv = (V9001 == 1401301 | V9001 == 1401401) & `is_recent'
bysort fam_id: egen has_new_tv = max(temp_tv)
label var has_new_tv "Has new TV (less than 4 yrs old)"
drop temp_tv

gen temp_pc = (V9001 == 1401901) & `is_recent'
bysort fam_id: egen has_new_pc = max(temp_pc)
label var has_new_pc "Has new computer (less than 4 yrs old)"
drop temp_pc

gen temp_car = (V9001 == 1403001) & `is_recent'
bysort fam_id: egen has_new_car = max(temp_car)
label var has_new_car "Has new car (less than 4 yrs old)"
drop temp_car


* Wrap up and export
********************
* Keep the identifier and every has_* flag; the flags are constant within
* fam_id, so dropping duplicate rows leaves one row per distinct combination.
keep fam_id has_*
order fam_id
sort fam_id

duplicates drop
compress
describe
save "${analysis}/data/2_1_pof_inventario_summary.dta", replace


******************************
* CONDICOES DE VIDA [fam_id] *
******************************
use "${analysis}/data/source_files/pof/condicoes_vida.dta", clear

* Convert the key components to strings (COD_INFORMANTE is converted too,
* although only the first three enter the family identifier)
tostring COD_UPA NUM_DOM NUM_UC COD_INFORMANTE, replace

* Family unit identifier: COD_UPA-NUM_DOM-NUM_UC
gen fam_id = COD_UPA + "-" + NUM_DOM + "-" + NUM_UC
label variable fam_id "Family unit identifier"


* How hard is it to make ends meet
**********************************
* Reverse the 1-6 scale of V6101 (6 -> 1 "very easy", ..., 1 -> 6 "very hard");
* any other value of V6101 leaves ends_meet missing.
gen ends_meet = 7 - V6101 if inlist(V6101, 1, 2, 3, 4, 5, 6)

label define ends_meet ///
  1 "Very easy to make ends meet" ///
  2 "Easy to make ends meet" ///
  3 "Somewhat easy to make ends meet" ///
  4 "Somewhat hard to make ends meet" ///
  5 "Hard to make ends meet" ///
  6 "Very hard to make ends meet"

label values ends_meet ends_meet
label variable ends_meet "How hard is it to make ends meet"


* Life conditions variables
***************************
* Topic k is read from V6104k: code 1 sets the "good" flag, code 3 the "bad" flag.
local cond_topics food housing clothing education health leisure

local k = 0
foreach topic of local cond_topics {
    local ++k

    gen good_cond_`topic' = (V6104`k' == 1)
    label variable good_cond_`topic' "Good `topic' conditions"

    gen bad_cond_`topic' = (V6104`k' == 3)
    label variable bad_cond_`topic' "Poor `topic' conditions"
}


* Infrastructure variables
**************************
* Service k is read from V6105k: code 1 sets the "good" flag, codes 3-4 the "bad" flag.
local infr_names  water electricity light waste street drainage sewage transport
local infr_labels `" "water supply" "electricity supply" "public lighting" "waste collection" "public street cleaning" "stormwater drainage" "sewage system" "public transportation" "'

local k = 0
foreach service of local infr_names {
    local ++k
    local text : word `k' of `infr_labels'

    gen good_infr_`service' = (V6105`k' == 1)
    label variable good_infr_`service' "Good `text'"

    gen bad_infr_`service' = inlist(V6105`k', 3, 4)
    label variable bad_infr_`service' "Poor `text'"
}


* House conditions
******************
* Each flag is 1 when any of the listed source variables equals 1.
gen house_lack_space = inlist(1, V61061)
label variable house_lack_space "Insufficient space"

gen house_lack_light = inlist(1, V61062)
label variable house_lack_light "Poor lighting"

gen house_dampness = inlist(1, V61063, V61064, V61065)
label variable house_dampness "Presence of leakages or dampness"

gen house_pest = inlist(1, V61066)
label variable house_pest "Presence of domestic pests"

gen house_pollution = inlist(1, V61067, V61068)
label variable house_pollution "Noise or air pollution"

gen house_flood = inlist(1, V61069, V610610)
label variable house_flood "Subject to flooding or landslides"

gen house_violence = inlist(1, V610611)
label variable house_violence "Violent area"

gen house_payment_difficulty = inlist(1, V61071, V61072, V61073)
label variable house_payment_difficulty "Missed a domestic bill last year"


* Food conditions
*****************
* Each flag is 1 when any of the listed source variables equals 1.
gen food_worry = inlist(1, V6108)
label variable food_worry "Worried about food shortage"

gen food_short = inlist(1, V6109, V6110, V6111)
label variable food_short "Experienced food shortage"

gen food_adult_hunger = inlist(1, V6112, V6113, V6114, V6115)
label variable food_adult_hunger "Adult (above 18) ate less, skipped meals, faced hunger"

gen food_young_hunger = inlist(1, V6116, V6117, V6118, V6119, V6120, V6121)
label variable food_young_hunger "Children (below 18) ate less, skipped meals, faced hunger"

* Either of the two hunger flags
gen food_hunger = inlist(1, food_adult_hunger, food_young_hunger)
label variable food_hunger "Family member faced hunger"


* Wrap up and export
********************
* Variables written to the family-level living-conditions file, in output order
global export_set ///
  fam_id ends_meet ///
  good_cond_* bad_cond_* ///
  good_infr_* bad_infr_* ///
  house_* food_*

keep  ${export_set}
order ${export_set}
sort  fam_id

duplicates drop
compress
describe
save "${analysis}/data/2_1_pof_condicoes_vida_summary.dta", replace


**************************
* RENDIMENTO DO TRABALHO *
**************************
use "${analysis}/data/source_files/pof/rendimento_trabalho.dta", clear

* Store the identifier components as strings so they can be concatenated
foreach id_part of varlist COD_UPA NUM_DOM NUM_UC COD_INFORMANTE {
	tostring `id_part', replace
}

* Individual identifier: the four components joined by "-"
generate ind_id = COD_UPA + "-" + NUM_DOM + "-" + NUM_UC + "-" + COD_INFORMANTE
label variable ind_id "Individual identifier"


* Summarize individual labor info (category, income, hours) from main job, latest pay
*************************************************************************************

* Restrict the file to MAIN JOB records
keep if SUB_QUADRO == 1

* Value label shared by every work-state variable
label define work_state ///
  1 "Own-account worker" ///
  2 "Employee" ///
  3 "Employer" ///
  4 "Aux worker" ///
  5 "Unemployed" ///
  6 "Inactive" ///
  7 "Below working age"

* Main job, default categories
* V5302 codes: 6 -> oaw, 1-4 -> employee (incl. domestic workers), 5 -> employer, 7 -> aux;
* any other value is left missing.
generate main_job_ibge = cond(V5302 == 6, 1, ///
                         cond(inlist(V5302, 1, 2, 3, 4), 2, ///
                         cond(V5302 == 5, 3, ///
                         cond(V5302 == 7, 4, .))))

label values main_job_ibge work_state
label variable main_job_ibge "Main occupational status (IBGE default)"

* Main job, my categories (domestic worker as OAW)
* Same mapping as above, except that code 1 moves from employee to oaw.
generate main_job = cond(inlist(V5302, 1, 6), 1, ///
                    cond(inlist(V5302, 2, 3, 4), 2, ///
                    cond(V5302 == 5, 3, ///
                    cond(V5302 == 7, 4, .))))

label values main_job work_state
label variable main_job "Main occupational status"

* Reality check: cross-tabulate the two classifications
tabulate main_job main_job_ibge

* Main job, gross income, latest pay
generate main_job_gross_inc = V8500_DEFLA
label variable main_job_gross_inc "Main job, gross income, latest pay"

* Deductions
* A system-missing deduction item is counted as zero so the sum below is never
* lost because a single component is missing.
foreach item of varlist V531112_DEFLA V531122_DEFLA V531132_DEFLA {
	replace `item' = 0 if `item' == .
}

generate main_job_deductions = V531112_DEFLA + V531122_DEFLA + V531132_DEFLA
label variable main_job_deductions "Main job, deductions and taxes, latest pay"


* Working hours
***************
* Monthly hours are approximated as four times V5314
generate main_job_work_hours = 4 * V5314
label variable main_job_work_hours "Monthly work hours, main job"

* Full time
* The if qualifier leaves the flag missing when hours are missing
* (otherwise a missing value would compare as greater than 100).
generate main_job_full_time = (main_job_work_hours > 100) if !missing(main_job_work_hours)
label variable main_job_full_time "Over 100 monthly hours in the main job" 

* Time in transit
* Code 9 of V5315 is not carried over and ends up missing
generate transit_time = V5315 if V5315 != 9

label define transit_time ///
  1 "Up to five minutes" ///
  2 "6 to 30 minutes" ///
  3 "30 minutes to one hour" ///
  4 "One to two hours" ///
  5 "More than 2 hours"

label values transit_time transit_time
label variable transit_time "Commute time to main job"


* Month of reference
********************
generate main_job_month = V9010
label variable main_job_month "Most recent month for main job"


* Job position and job sector
*****************************
* The raw codes are kept as they are; only the variable names change
rename (V53011 V53061) (occupation sector)
label variable occupation "See ocupação COD (V53011)"
label variable sector "See atividade CNAE (V53061)"


* Formality status
******************
* Formal when either V5304 or V5305 is coded 1; 0 otherwise (including missing answers)
generate formal = inlist(1, V5304, V5305)
label variable formal "Formal occupation ('carteira assinada' or retirement contribution)"

tabulate main_job formal, missing


* Wrap up and export
********************
* Variables kept in the individual-level main-job summary
global export_set ///
  ind_id ///
  main_job_ibge main_job ///
  main_job_gross_inc main_job_deductions ///
  main_job_work_hours main_job_full_time main_job_month ///
  transit_time occupation sector formal

keep ${export_set}
order ${export_set}
sort ind_id

* Drop fully identical rows; the file is later merged m:1 on ind_id
duplicates drop
compress
describe
save "${analysis}/data/2_1_pof_ind_job_inc_summary.dta", replace


*******************************
* OUTROS RENDIMENTOS [ind_id] *
*******************************
use "${analysis}/data/source_files/pof/outros_rendimentos.dta", clear

* Store the identifier components as strings so they can be concatenated
foreach id_part of varlist COD_UPA NUM_DOM NUM_UC COD_INFORMANTE {
	tostring `id_part', replace
}

* Individual identifier: the four components joined by "-"
generate ind_id = COD_UPA + "-" + NUM_DOM + "-" + NUM_UC + "-" + COD_INFORMANTE
label variable ind_id "Individual identifier"


* Aggregate other work income (extra hours, performance pay, other bonuses), latest pay
***************************************************************************************

* Import categories
* Attach the income category of each V9001 item code; codes that exist only in
* the auxiliary map are not brought into the data.
merge m:1 V9001 using "${analysis}/data/1_3_aux_pof_map.dta", keepusing(inc_cat code) keep(master match)
keep  ind_id QUADRO V9001 V9011 FATOR_ANUALIZACAO inc_cat V8500_DEFLA V8501_DEFLA 
order ind_id QUADRO V9001 V9011 FATOR_ANUALIZACAO inc_cat V8500_DEFLA V8501_DEFLA 
sort  ind_id QUADRO V9001

* Focus on other work income
keep if inc_cat == "Other work income"

* Value of other income items (monthly average)
* The value is annualized with FATOR_ANUALIZACAO and divided by 12. For
* QUADRO 54 it is additionally multiplied by V9011.
* NOTE(review): presumably V9011 is the number of months the item was received -- confirm in the POF dictionary.
gen     inc_value = (V8500_DEFLA         * FATOR_ANUALIZACAO)/12
replace inc_value = (V8500_DEFLA * V9011 * FATOR_ANUALIZACAO)/12 if (QUADRO == 54)
replace inc_value = 0 if missing(inc_value)

* Value of deductions (monthly average), same rule as the income value
gen     tax_value = (V8501_DEFLA         * FATOR_ANUALIZACAO)/12
replace tax_value = (V8501_DEFLA * V9011 * FATOR_ANUALIZACAO)/12 if (QUADRO == 54)
replace tax_value = 0 if missing(tax_value)

* Sum the monthly average of all types of other work income 
* total() is the documented egen function for a group sum; the legacy name
* sum() is easy to confuse with the running-sum function of generate.
bysort ind_id: egen other_work_inc_gross = total(inc_value)

* Sum the monthly average of all types of deductions related to other work income 
bysort ind_id: egen other_work_inc_tax = total(tax_value)


* Wrap up and export
********************
* NOTE(review): both totals are sums of monthly averages, while the labels say
* "latest pay" -- confirm that this wording is intended.
label variable other_work_inc_gross "Other work income, gross, latest pay"
label variable other_work_inc_tax   "Other work income, deductions and taxes, latest pay"

* Variables kept in the individual-level other-work-income summary
global export_set ind_id other_work_inc_gross other_work_inc_tax

keep ${export_set}
order ${export_set}
sort ind_id

* The totals are constant within ind_id, so this leaves one row per individual
duplicates drop
compress
describe
save "${analysis}/data/2_1_pof_ind_other_work_inc_summary.dta", replace


*********************************************************************************
* MORADOR + DOMICILIO + INVENTARIO + CONDICOES DE VIDA + RENDIMENTO DO TRABALHO *
*********************************************************************************
use "${analysis}/data/2_1_pof_morador_summary.dta", clear

* Attach each summary file in turn, m:1 on its own key.
* The i-th key in merge_keys belongs to the i-th file in merge_files.
local merge_keys  hu_id     fam_id     fam_id          ind_id      ind_id
local merge_files domicilio inventario condicoes_vida  ind_job_inc ind_other_work_inc

forvalues i = 1/5 {
	local key  : word `i' of `merge_keys'
	local part : word `i' of `merge_files'
	merge m:1 `key' using "${analysis}/data/2_1_pof_`part'_summary.dta", nogenerate
}

* Work state summary (IBGE default)
tabulate main_job_ibge, missing
label list work_state

* Build the state name in three steps:
*  1. copy the main-job category name when there is one,
*  2. overwrite it with "Below working age" for anyone younger than 14,
*  3. call everyone still unnamed "Inactive".
generate work_state_ibge_name = ""
foreach state in "Own-account worker" "Employee" "Employer" "Aux worker" {
	replace work_state_ibge_name = "`state'" if main_job_ibge == "`state'":work_state
}
replace work_state_ibge_name = "Below working age"  if years_age < 14
replace work_state_ibge_name = "Inactive"           if work_state_ibge_name == ""

label variable work_state_ibge_name "Status in the labor market (IBGE default)"
tabulate work_state_ibge_name, missing

* Numeric version, coded with the existing work_state value label
encode work_state_ibge_name, gen(work_state_ibge) label(work_state)
label variable work_state_ibge "Status in the labor market (IBGE default)"
tabulate work_state_ibge, missing

* Work state summary (our classification)
tabulate main_job, missing
label list work_state

* Same three steps as the IBGE version: job category name, then
* "Below working age" under 14, then "Inactive" for whoever is still unnamed.
* NOTE(review): "Aux worker" is not mapped here, so auxiliary workers aged 14+
* end up as "Inactive" -- confirm that this is intended.
generate work_state_name = ""
foreach state in "Own-account worker" "Employee" "Employer" {
	replace work_state_name = "`state'" if main_job == "`state'":work_state
}
replace work_state_name = "Below working age"  if years_age < 14
replace work_state_name = "Inactive"           if work_state_name == ""

label variable work_state_name "Status in the labor market"
tabulate work_state_name, missing

* Numeric version, coded with the existing work_state value label
encode work_state_name, gen(work_state) label(work_state)
label variable work_state "Status in the labor market"
tabulate work_state, missing

* Check: compare our classification with the IBGE default
tabulate work_state work_state_ibge

* Individuals per room in the house
generate people_per_sleep_room = family_size / n_sleep_rooms
label variable people_per_sleep_room "People per sleeping room"
drop n_sleep_rooms

* Net available work income, most recent
* Individuals with no other-work-income record get zero for both components;
* winc stays missing when the main-job figures are missing.
foreach extra of varlist other_work_inc_gross other_work_inc_tax {
	replace `extra' = 0 if missing(`extra')
}
generate winc = main_job_gross_inc + other_work_inc_gross - main_job_deductions - other_work_inc_tax
label variable winc "Net available work income, main job, latest pay"

* Define log available work income
* The group-specific versions copy ln_winc for one work state only and are
* missing for everyone else.
generate ln_winc          = ln(winc)
generate ln_winc_oaw      = ln_winc if work_state_name == "Own-account worker"
generate ln_winc_employee = ln_winc if work_state_name == "Employee"
generate ln_winc_employer = ln_winc if work_state_name == "Employer"

label variable ln_winc          "Log work income"
label variable ln_winc_oaw      "Own-account workers' log work income"
label variable ln_winc_employee "Employees' log work income"
label variable ln_winc_employer "Employers' log work income"


* Wrap up and export
********************
* Variables kept in the final attributes file, grouped by theme
global export_set ///
  state region strata_id urban psu_id pweight hu_id fam_id ind_id ///
  work_state work_state_name work_state_ibge work_state_ibge_name occupation sector formal ///
  family_position male white race_gender years_age age years_educ educ attending_school ///
  health_plan no_credit_card no_current_account no_savings_account no_overdraft_facility ///
  winc ln_winc ln_winc_oaw ln_winc_employee ln_winc_employer ///
  main_job_gross_inc main_job_deductions main_job_work_hours main_job_full_time transit_time main_job_month ///
  other_work_inc_gross other_work_inc_tax ///
  family_size n_kids n_youngs n_adults n_seniors ///
  gross_inc_total net_inc_pc ///
  house_own house_rented house_other has_fridge has_tv has_pc has_car has_new_fridge has_new_tv has_new_pc has_new_car ///
  people_per_sleep_room piped_water_lack waste_lack energy_lack unpaved_street food_insec ///
  ends_meet good_cond_* bad_cond_* good_infr_* bad_infr_* house_* food_* ///
  rg_* fp_* age_* educ_* ae_* at_* region_*

keep ${export_set}
order ${export_set}
sort strata_id ind_id
duplicates drop
compress
describe

* How many households, families and individuals?
* codebook reports the number of unique values of each identifier.
foreach id in hu_id fam_id ind_id {
	codebook `id'
}

* Export
save "${analysis}/data/2_1_pof_attributes.dta", replace


* Housekeeping
**************
* Delete the intermediate summary files now that the merged file is saved
foreach part in morador domicilio inventario condicoes_vida ind_job_inc ind_other_work_inc {
	erase "${analysis}/data/2_1_pof_`part'_summary.dta"
}


* End of script
***************
* capture keeps the script from failing if no log is open
capture log close